Reference: https://www.kaggle.com/c/web-traffic-time-series-forecasting/data
I cleaned the Kaggle Wikipedia web-traffic data and kept only the 2016 observations, sampled with a fraction of 0.1.
The data was then melted into long format and additional columns were created.
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('fivethirtyeight') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8
import os
import time
# Seed numpy's global random generator with a fixed value
random_state=100
np.random.seed(random_state)
# Jupyter notebook display settings for numpy and pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)  # digits shown when printing numpy floats
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)
import IPython
from IPython.display import display, HTML, Image, Markdown
print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
import dask
import dask.dataframe as dd
import gc
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
def show_method_attributes(method, ncols=7, start=None, exclude=None):
    """Tabulate the public attribute names of an object.

    Parameters
    ----------
    method : object
        Any object accepted by ``dir()``.
    ncols : int, default 7
        Number of columns in the returned table.
    start : str, optional
        When given, keep only the names that start with this prefix.
    exclude : str or iterable of str, optional
        Names to leave out of the table. A single string is split on
        whitespace. Defaults to ``'os np pd sys time psycopg2'``.

    Returns
    -------
    pandas.DataFrame
        The names split into ``ncols`` chunks, one chunk per column;
        short columns are padded with empty strings.

    Example
    -------
    show_method_attributes(list)
    """
    if exclude is None:
        exclude = 'os np pd sys time psycopg2'
    if isinstance(exclude, str):
        exclude = exclude.split()
    hidden = set(exclude)

    # Names with a leading underscore are treated as private and skipped.
    names = [name for name in dir(method)
             if not name.startswith('_') and name not in hidden]
    if start:
        names = [name for name in names if name.startswith(start)]

    # array_split yields ncols chunks of near-equal size; each chunk is a
    # row of the intermediate frame, so the transpose has ncols columns.
    return pd.DataFrame(np.array_split(names, ncols)).T.fillna('')
# Read the cleaned 2016 sample, asking pandas to parse the 'date' column.
csv_path = '../../data/wiki/processed/data_cleaned_2016_frac01.csv'
df = pd.read_csv(csv_path, parse_dates=['date'])
print(df.shape)  # 5.3 million rows, 21 cols
df.head()
df.dtypes
df.memory_usage(deep=True).sum() * 1e-6  # MB
# Every row is from 2016, so the 'year' column is dropped.
df.drop(columns='year', inplace=True)

cols_int = ['visits']
cols_cat = ['month', 'day', 'quarter', 'day_name', 'month_name',
            'project', 'access', 'agent', 'language']
cols_float = ['mean', 'median']

# Cast each group of columns to its target dtype, one column at a time.
dtype_plan = (
    (cols_int, np.int32),
    (cols_float, np.float32),
    (cols_cat, pd.api.types.CategoricalDtype()),
)
for column_group, target_dtype in dtype_plan:
    for column_name in column_group:
        df[column_name] = df[column_name].astype(target_dtype)
# Turn day_name and month_name into ordered categoricals with an explicit
# calendar order.
df['day_name'].unique()
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_dtype = CategoricalDtype(categories=cats, ordered=True)
df['day_name'] = df['day_name'].astype(weekday_dtype)
df['day_name'].unique()

df['month_name'].unique()
show_method_attributes(df['month_name'].unique())
df['month_name'].unique().categories
cats = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December']
month_dtype = CategoricalDtype(categories=cats, ordered=True)
df['month_name'] = df['month_name'].astype(month_dtype)
df['month_name'].unique()

df.memory_usage(deep=True).sum() * 1e-6  # MB
# Print the value counts of each categorical column, sorted by index,
# with a blank line after each table.
for column_name in cols_cat:
    counts = df[column_name].value_counts().sort_index()
    print(column_name, counts, sep='\n', end='\n\n')
print(df.shape)
df.head()
df['Page'].nunique()  # there are 14.5k unique pages visited in 2016

# Total visits per page, reused for the three views below.
visits_per_page = df.groupby('Page')['visits'].sum()
visits_per_page
visits_per_page.sort_values(ascending=False)
visits_per_page.nlargest(5)
df.query(""" Page == 'Special:Search_en.wikipedia.org_desktop_all-agents' """).head()

# Earlier attempts at "top 5 rows per language", kept for reference:
# df.groupby('language')['visits'].nlargest(5)
# df.groupby('language')['visits'].apply(lambda x: x.nlargest(5))
# df.groupby('language')['visits'].apply(lambda x: x.nlargest(5).index)
# For each language, look up the full rows of its five largest visit counts.
df.groupby('language')['visits'].apply(
    lambda lang_visits: df.loc[lang_visits.nlargest(5).index])
# Page with the largest total visit count.
idx = df.groupby('Page')['visits'].sum().idxmax()
top_page_rows = df.query(""" Page == @idx """)
top_page_rows.head()

# Date-indexed visit series for that page.
ts = top_page_rows[['date', 'visits']].set_index('date')
print(ts.shape)
ts.head()
ts.plot()
# Observations:
# - ts is periodic
# - ts has some very large peaks
# - ts is not going upward; it shows no trend (it might with more years of data)
ts.groupby(ts.index.month).plot();
df.head(2)
# Render the month-by-language pointplot only when the PNG is not on disk yet,
# then display the saved image either way.
fname_lang_monthly_mean = '../reports/figures/2016_sample001_monthly_visits.png'
if not os.path.isfile(fname_lang_monthly_mean):
    plt.figure(figsize=(12,12))
    # Pass the estimator as a callable: seaborn releases that predate string
    # estimators call it directly and would fail on the string 'mean'.
    sns.pointplot(x="month_name", y="visits", hue='language', data=df,
                  estimator=np.mean)
    plt.savefig(fname_lang_monthly_mean, dpi=300)
Image(fname_lang_monthly_mean)
# df.groupby(['month_name', 'language'])['visits'].sum().unstack().reset_index()
# TypeError: cannot insert an item into a CategoricalIndex that is not already an existing category

# Mean visits per month (rows) and language (columns).
monthly_mean = df.groupby(['month_name', 'language'])['visits'].mean().unstack()
monthly_mean
# The index is replaced by 0..n-1 so the months plot as evenly spaced points.
ax = monthly_mean.reset_index(drop=True).plot(figsize=(12,12), logy=False)
# Label the ticks from the grouped frame's own index so that every label
# matches the row actually plotted at that position; Series.unique() returns
# values in order of appearance, which need not be calendar order.
ax.set_xticks(range(len(monthly_mean.index)))
ax.set_xticklabels(monthly_mean.index, rotation=90);
# exclude english and compare others
# plt.figure(figsize=(12,12))
# df.query("""language != 'English'""").pipe((sns.pointplot,'data'), x='month_name',
# y='visits',hue='language')
# # ValueError: 'c' argument has 12 elements, which is not acceptable for use with 'x' with size 0, 'y' with size 0.

# Mean visits per month (rows) and language (columns), English rows removed.
monthly_mean_non_english = df.query("""language != 'English'""")\
    .groupby(['month_name', 'language'])['visits'].mean().unstack()
ax = monthly_mean_non_english.reset_index(drop=True)\
    .plot(figsize=(12,12), logy=False)
# Take the tick labels from the grouped frame's index so they line up with the
# plotted rows; Series.unique() returns values in order of appearance, which
# need not be calendar order.
ax.set_xticks(range(len(monthly_mean_non_english.index)))
ax.set_xticklabels(monthly_mean_non_english.index, rotation=90);
# Mean visits per day of year, one line per language, saved as a PNG.
daily_mean_by_language = df.groupby(['dayofyear', 'language'])['visits'].mean().unstack()
daily_axes = daily_mean_by_language.plot(figsize=(12,12))
daily_axes.get_figure().savefig('../reports/figures/daily_visits.png', dpi=300)
%%time
# Seaborn version of the daily plot, left disabled; the recorded timing
# below is from a previous run.
# plt.figure(figsize=(12,12))
# sns.lineplot(x='dayofyear',y='visits',hue='language',data=df)
# Wall time: 2min 27s
# Show the categories of day_name (unique() on a categorical column returns a Categorical).
df['day_name'].unique().categories
%%time
sns.barplot(x='day_name',y='visits',data=df,
order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
'Sunday'])
plt.savefig('../reports/figures/visits_by_weekday.png')
%%time
df.groupby('day_name')['visits'].mean().sort_index().plot.bar()
%%time
ax = df.groupby('day_name')['visits'].mean().sort_index(ascending=False)\
.plot.barh(use_index=True, color=sns.color_palette('husl',7))
plt.tick_params(axis='y', which='both', labelright='on')
# Mean visits per (weekday, month): weekdays on the x axis, one bar per month.
weekday_month_mean = df.groupby(['day_name', 'month_name'])['visits'].mean()
weekday_month_mean.sort_index(ascending=False).unstack().plot.bar()
%%time
sns.catplot(data=df, kind='bar',ci=None,
x='day_name',y='visits',hue='month_name')
plt.xticks(rotation=90)
plt.savefig('../reports/figures/visits_per_month_per_weekday.png')
df.groupby(['day_name','month_name'])['visits'].mean().sort_index(ascending=False).unstack(0).plot.bar()
# Mean visits for every (weekday, day-of-month) pair, reshaped into a
# day-of-month x weekday grid; rows containing any missing cell are dropped.
# pivot() is called with keywords because pandas 2.0 made its arguments
# keyword-only.
df1 = df.groupby(["day_name", "day"])['visits'].mean().reset_index()\
    .pivot(index='day', columns='day_name', values='visits').dropna()
fig, ax = plt.subplots(figsize=(50, 30))
# invert_yaxis flips the direction of the heatmap's y axis.
sns.heatmap(data=df1, annot=False, ax=ax, fmt="d", linewidths=2).invert_yaxis()
plt.title('Web Traffic per Days of Week',fontsize=28)
plt.xlabel('Week Day Name', fontsize=28)
plt.ylabel('Day of Month', fontsize=28)
plt.xticks(fontsize=28)
plt.yticks(fontsize=28)
# Enlarge the colorbar tick labels to match the other fonts.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=28)
plt.savefig('../reports/figures/visits_per_dayofmonth_per_dayofweek.png')
plt.show()
# Mean visits for every (month, day-of-month) pair, reshaped into a
# day-of-month x month grid; rows containing any missing cell are dropped.
# pivot() is called with keywords because pandas 2.0 made its arguments
# keyword-only.
df1 = df.groupby(["month_name", "day"])['visits'].mean().reset_index()\
    .pivot(index='day', columns='month_name', values='visits').dropna()
fig, ax = plt.subplots(figsize=(50, 30))
# sns.set(font_scale=3)
# invert_yaxis flips the direction of the heatmap's y axis.
sns.heatmap(data=df1, annot=False, ax=ax, fmt="d", linewidths=2).invert_yaxis()
plt.title('Web Traffic for Months per days of month',fontsize=28)
plt.xlabel('Month Name', fontsize=28)
plt.ylabel('Day of Month', fontsize=28)
plt.xticks(fontsize=28)
plt.yticks(fontsize=28)
# Enlarge the colorbar tick labels to match the other fonts.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=28)
plt.show()
Resources:
Here we can see that the plots appear periodic in the time domain. We can therefore work in the frequency domain by applying the FFT to the time series. Peaks in the FFT magnitude show us the strongest frequencies in the periodic signal.
The Fourier transform is an alternative representation of a signal as a superposition of periodic components. It is an important mathematical result that any well-behaved function can be represented under this form. Whereas a time-varying signal is most naturally considered as a function of time, the Fourier transform represents it as a function of the frequency. A magnitude and a phase, which are both encoded in a single complex number, are associated to each frequency.
The Discrete Fourier Transform
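Let's consider a digital signal $x$ represented by a vector $(x_0, \dots, x_{N-1})$. We assume that this signal is regularly sampled. The Discrete Fourier Transform (DFT) of $x$ is $X = (X_0, \dots, X_{N-1})$, defined as:
$$X_k = \sum_{n=0}^{N-1} x_n \, e^{-2 i \pi k n / N}, \qquad k = 0, \dots, N-1.$$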
The DFT can be computed efficiently with the Fast Fourier Transform (FFT), an algorithm that exploits symmetries and redundancies in this definition to considerably speed up the computation. The complexity of the FFT is $O(N \log N)$ instead of $O(N^2)$ for the naive DFT. The FFT is one of the most important algorithms of the digital universe.
# unique() returns values in order of appearance, so sort them: the grouped
# frame below is indexed by ascending day of year and `days` is used as its
# x axis.
days = np.sort(df['dayofyear'].unique())
# Mean visits per day of year (rows) and language (columns).
df_daily = df.groupby(['dayofyear', 'language'])['visits'].mean().unstack()
df_daily.head()
df_daily.columns
def _fft_magnitude(values):
    """Return (frequencies, magnitudes) of the one-sided FFT of `values`."""
    values = np.asarray(values, dtype=float)
    # With one sample per day, bin k sits at k / N cycles per day, so the
    # first bin (the mean/DC term) is at frequency 0.
    freqs = np.fft.rfftfreq(len(values), d=1.0)
    mags = np.abs(np.fft.rfft(values))
    return freqs, mags


def plot_with_fft(df_daily, col):
    """Plot one column of `df_daily` and the magnitude of its Fourier transform.

    Parameters
    ----------
    df_daily : pandas.DataFrame
        Frame with one row per day; the index is used as the x axis of the
        time-domain plot.
    col : hashable
        Label of the column to plot; also used as title and legend label.

    Draws on matplotlib figures 1 (time series) and 2 (spectrum) and calls
    ``plt.show()``. Returns None.
    """
    series = df_daily[col]

    plt.figure(1, figsize=[15,5])
    plt.ylabel('Views per Page')
    plt.xlabel('Day')
    plt.title(col)
    plt.xticks(range(0,370,10))
    # Use the frame's own index so x and y values are always aligned.
    plt.plot(series.index, series.to_numpy(), label=col)

    plt.figure(2, figsize=[15,5])
    fft_xvals, fft_mag = _fft_magnitude(series.to_numpy())
    plt.ylabel('FFT Magnitude')
    plt.xlabel(r"Frequency [days]$^{-1}$")
    plt.title('Fourier Transform')
    # Skip the first bin: it holds the zero-frequency (mean) component.
    plt.plot(fft_xvals[1:], fft_mag[1:], label=col)
    # Draw lines at 1, 1/2, and 1/3 week periods
    plt.axvline(x=1./7,color='red',alpha=0.3)
    plt.axvline(x=2./7,color='red',alpha=0.3)
    plt.axvline(x=3./7,color='red',alpha=0.3)
    plt.show()
# Draw the time-series / spectrum pair for each language column in turn.
for language in df_daily.columns:
    plot_with_fft(df_daily, language)